# 10982 records in total
# 2982 long dialogues
# 8000 short dialogues

import pandas as pd

# Load the dataset and pull out the three columns that the rest of the
# script reads by name.
train_data = pd.read_json("train_data.json")
instruction, intput_data, output = (
    train_data[column] for column in ("instruction", "input", "output")
)

# NOTE: an earlier debugging check recorded len(intput_data) == 2982.

def _pair_utterances(raw_lines, separator="==="):
    """Group corpus lines into dialogues and cut each one into 2-line pairs.

    Lines are paired only with lines of the same dialogue (the run of lines
    between two separator lines). Pairing across the flattened file instead
    would let a single dialogue with an odd number of lines shift every
    later pair by one.

    Args:
        raw_lines: Lines as read from the corpus, with or without their
            trailing newline.
        separator: Exact line content that marks a dialogue boundary.

    Returns:
        A tuple ``(utterances, pairs)``: every non-separator line in file
        order, and the list of ``[first, second]`` pairs. A leftover line
        without a partner is not turned into a pair, so every entry of
        ``pairs`` has exactly two elements.
    """
    utterances = []
    pairs = []
    dialogue = []
    # The extra trailing separator flushes the final dialogue even when the
    # file does not end with a separator line.
    for raw in [*raw_lines, separator]:
        text = raw.replace("\n", "")
        if text != separator:
            utterances.append(text)
            dialogue.append(text)
            continue
        # Stopping at len - 1 skips a dangling last line of an odd-length
        # dialogue instead of emitting a one-element "pair".
        pairs.extend(dialogue[k:k + 2] for k in range(0, len(dialogue) - 1, 2))
        dialogue = []
    return utterances, pairs


train_file = "corpus.txt"
with open(train_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# train_datas_2 keeps the flat list of utterances (separators removed);
# pairs holds the [first, second] pairs built per dialogue.
train_datas_2, pairs = _pair_utterances(lines)

# Show a small sample so the pairing can be checked by eye.
for sample in pairs[:5]:
    print(sample)


# Characters that would break the one-record-per-line, tab-separated layout
# of the output file. Line feeds and carriage returns are removed (a lone
# "\r" is read back as a line break in text mode); a tab inside a field
# would be mistaken for the column separator, so it becomes a plain space.
_TSV_UNSAFE = str.maketrans({"\n": None, "\r": None, "\t": " "})

# Write one "source<TAB>target" record per line: first every row of the JSON
# dataset (instruction and input concatenated as the source, output as the
# target), then at most the first 8000 corpus pairs.
with open("dataset.txt", "w", encoding="utf-8") as f:
    # zip walks the three columns in lockstep by position, so it does not
    # depend on the DataFrame index being 0..n-1.
    for prompt, context, answer in zip(instruction, intput_data, output):
        source = (prompt + context).translate(_TSV_UNSAFE)
        target = answer.translate(_TSV_UNSAFE)
        f.write(f"{source}\t{target}\n")

    for pair in pairs[:8000]:
        # A pair with a missing partner (e.g. a dangling last corpus line)
        # cannot form a record; skip it rather than fail on pair[1].
        if len(pair) != 2:
            continue
        first, second = (text.translate(_TSV_UNSAFE) for text in pair)
        f.write(f"{first}\t{second}\n")





